Introduction

Packages used

library(tidyverse)
library(data.table)
library(caret)
library(ggthemes)
library(lubridate)
library(DT)

Reading data

# Load the raw competition files; fread() is fast and returns data.tables.
trainperf <- fread("trainperf.csv")
traindemographics <- fread("traindemographics.csv")
testperf <- fread("testperf.csv")
testdemographics <- fread("testdemographics.csv")
SampleSubmission <- fread("SampleSubmission.csv")

Combine test set and train set

  • combining the two sets makes the cleaning steps easier to apply consistently
# Merge demographics onto performance records, keeping every performance row
# (all.y = TRUE) since some loans may have no demographics match.
# Fix: spell out TRUE instead of the reassignable shorthand T.
train_data <- merge(traindemographics, trainperf, all.y = TRUE, by = "customerid")
test_data <- merge(testdemographics, testperf, all.y = TRUE, by = "customerid")
# Stack train and test with a tag so they can be split again after shared
# cleaning; fill = TRUE pads columns present in only one of the two sets.
loan_data <- rbind(train_data[, set := "train"], test_data[, set := "test"], fill = TRUE)

loan_data %>% head() %>%
  datatable(options = list(scrollX = TRUE))

Feature engineering

Dates

  • Create new variables from the date columns, e.g. age, loan month, loan day, loan year
# Parse the raw date columns, then derive simple calendar features from them.
dates <- c("birthdate" ,"approveddate", "creationdate" )
loan_data[, (dates) := lapply(.SD, as.Date), .SDcols = dates]
# Age in years at approval time; dividing by 365 ignores leap years
# (a deliberate approximation).
loan_data[, age := (as.numeric(approveddate - birthdate))/365]
# NOTE(review): "aprove_month" is misspelled, but the same misspelled name is
# used again when columns are dropped later — rename in both places or not at all.
loan_data[, aprove_month := month(approveddate)]
loan_data[, approve_day := wday(approveddate)]
loan_data[, approve_year := year(approveddate)]

Bad loans distribution

# Share of good vs bad loans (rows with a known flag only), as percentages.
flag_summary <- loan_data[!is.na(good_bad_flag), .N, by = .(good_bad_flag)]
flag_summary[, perc := round(N / sum(N) * 100, 2)]

ggplot(flag_summary, aes(good_bad_flag, perc, fill = good_bad_flag)) +
  geom_col() +
  geom_text(aes(good_bad_flag, perc, label = paste(perc, "%"),
                vjust = .05, hjust = .5),
            size = 4) +
  theme_hc() +
  labs(title = "Percentage of bad loans") +
  scale_fill_colorblind(name = "") +
  theme(legend.position = "none")

Some cleaning

Clean string variables

  • convert empty chars to NA
# Character columns in which an empty or whitespace-only string means "missing".
chars <- c("bank_account_type", "bank_name_clients", 
           "bank_branch_clients", "employment_status_clients",
           "level_of_education_clients")

# Map blank strings to NA. trimws() generalizes the original ""/" " checks to
# any whitespace-only value (tabs, multiple spaces); real values pass through
# unchanged, and existing NAs stay NA.
loan_data[, (chars) := lapply(.SD, function(x) ifelse(trimws(x) == "", NA, x)), .SDcols = chars]

Missing values distribution

# Percentage of missing values per column; keep only columns with any NAs.
naVals <- colMeans(is.na(loan_data)) * 100

withNa <- naVals[naVals > 0]
nms_na <- names(withNa)
missing_perc <- data.table(variables = nms_na, perc = withNa)

Missing values distribution plot

# Horizontal bar chart of missingness, ordered from most to least missing.
ggplot(missing_perc, aes(reorder(variables, perc), perc)) +
  geom_col() +
  theme_fivethirtyeight() +
  coord_flip()

KNN imputation

# loannumber must be numeric for kNN's distance computation.
loan_data[, loannumber := as.numeric(loannumber)]
# NOTE(review): missing_var_del (columns >50% missing) is computed but never
# used afterwards — the intended drop of these columns appears to be missing.
missing_var_del <- missing_perc[perc>50, variables]
## KNN imputation
library(VIM)
# Drop the parsed date columns (already converted to derived features above)
# and referredby (presumably too sparse to be worth imputing — confirm).
loan_data[, (dates):= NULL]
loan_data[, referredby:= NULL]
# VIM::kNN fills NAs from the 10 nearest neighbours and appends a logical
# *_imp indicator column for every variable.
loan_data <- kNN(loan_data,useImputedDist = FALSE, k =10)

# kNN() returns a data.frame; restore data.table semantics.
setDT(loan_data)

# *_imp columns flag which cells were imputed.
nms_all <- names(loan_data)
nms_imp <- nms_all[grepl("_imp$", nms_all)]

# Recode the logical imputation indicators as 0/1.
loan_data[, (nms_imp) := lapply(.SD,
                                function(flag) ifelse(flag == FALSE, 0, 1)),
          .SDcols = nms_imp]

# Indicator columns summing to zero recorded no imputations at all; drop them.
col_sum_imp <- loan_data[, colSums(.SD), .SDcols = nms_imp]
col_sum_imp <- names(col_sum_imp[col_sum_imp == 0])
#var_importants <- fread("var_importanta.csv")
loan_data[, (col_sum_imp) := NULL]

loan_data %>%
  head() %>%
  datatable(options = list(scrollX = TRUE))

More cleaning

  • eg standardizing
# Target as a factor with "Bad" as the first level (the reference/event level).
loan_data[, good_bad_flag := factor(good_bad_flag, levels = c("Bad", "Good"))]

# Drop helper columns before modelling.
# Fix: the original list contained " good_bad_flag_imp" with a leading space,
# which matches no column — kNN names the indicator "good_bad_flag_imp".
# NOTE(review): no "year" column is created above (only approve_year) —
# confirm it exists before relying on this delete.
nms_del1 <- c("set_imp", "good_bad_flag_imp",
              "approve_year", "aprove_month",
              "year", "systemloanid")

loan_data[, (nms_del1) := NULL]

# Names of the plain numeric columns to rescale, excluding the 0/1 imputation
# indicators and the target.
class_nms <- sapply(loan_data, class)
nums <- class_nms[class_nms == "numeric"] %>% names()
nums <- nums[!grepl("_imp|good_bad_flag", nums)]

# Min–max scale a numeric vector to [0, 1].
#
# x: numeric vector (assumed NA-free; NAs were imputed earlier in the script).
# Returns a vector the same length as x. Fix: a constant vector (max == min)
# previously produced 0/0 = NaN; it is now mapped to all zeros.
zero_one <- function(x){

    rng <- range(x)
    span <- rng[2] - rng[1]
    if (span == 0) {
        return(rep(0, length(x)))
    }

    (x - rng[1]) / span
}


# Rescale every plain numeric predictor to [0, 1] in place.
loan_data[, (nums) := lapply(.SD, zero_one), .SDcols = nums]


# Split back into train/test using the tag added at the merge step. The
# data.table subset returns a new table, so dropping `set` on the copies
# does not touch loan_data itself.
train_data <- loan_data[set == "train"]
train_data[, set:= NULL]
test_data <- loan_data[set == "test"]
test_data[, set:= NULL]

Imbalanced Datasets

  • make the proportion of good loans to be the same as that of bad loans
# Balance the classes by downsampling the majority "Good" loans to the number
# of minority "Bad" loans.
train_bad <- train_data[good_bad_flag == "Bad"]
train_good <- train_data[good_bad_flag == "Good"]
n_good <- nrow(train_good)
n_bad <- nrow(train_bad)

# Same seed as the original run so the subsample is reproducible.
set.seed(200)
good_idx <- sample(1:n_good, n_bad)
train_good <- train_good[good_idx]
train_sampled <- rbind(train_bad, train_good)

Training Data

Tuning parameters and validation sets

## Model Cross validation

# Fix: createFolds() returns the HELD-OUT indices of each fold, but
# trainControl(index = ...) expects the TRAINING indices per resample.
# Passing the raw folds trained each resample on ~1/10 of the data — the
# printed "Summary of sample sizes: 190, 191, ..." out of 1906 rows confirms
# this. returnTrain = TRUE yields the complementary 9/10 training sets.
set.seed(100)
cv_fold <- createFolds(train_sampled$good_bad_flag, k = 10, returnTrain = TRUE)

train_ctrl <- trainControl(method = "cv",
                        number = 10,
                        summaryFunction = twoClassSummary,  # ROC / Sens / Spec
                        classProbs = TRUE,                  # required for ROC
                        allowParallel = TRUE,
                        index = cv_fold,
                        verboseIter = FALSE,
                        savePredictions = TRUE,
                        search = "grid")


# Hyper-parameter grids for the three candidate learners.
xgb_grid <- expand.grid(
  nrounds = c(50, 100),
  eta = 0.4,
  max_depth = c(2, 3),
  gamma = c(0, 0.01),
  colsample_bytree = c(0.6, 0.8, 1),
  min_child_weight = 1,
  subsample = c(0.5, 0.8, 1)
)

ranger_grid <- expand.grid(
  splitrule = c("extratrees", "gini"),
  mtry = c(10, 20, (ncol(train_data) - 2)),
  min.node.size = c(1, 5)
)

svm_grid <- expand.grid(
  C = c(1, 3, 5, 20),
  sigma = seq(0.001, 0.524, length.out = 7)
)

Model Fitting

library(caret)
library(caretEnsemble)
library(tictoc)
#tuneGrid= xgb_grid
tic()

# Fit all three models on identical resampling folds (train_ctrl) so their
# ROC results are directly comparable. customerid is an identifier, not a
# predictor, and is excluded from the formula data.
model_list <- caretList(
   good_bad_flag~.,
    data=train_sampled[, .SD, .SDcols = !"customerid"],
    metric = "ROC",
    trControl=train_ctrl,
    tuneList = list(caretModelSpec(method="xgbTree",tuneGrid= xgb_grid ),
                    caretModelSpec(method = "svmRadial", tuneGrid = svm_grid),
                    caretModelSpec(method="ranger", tuneGrid= ranger_grid)

                   )
)

toc()
## 184.56 sec elapsed

Model Output

model_list
## $xgbTree
## eXtreme Gradient Boosting 
## 
## 1906 samples
##   25 predictor
##    2 classes: 'Bad', 'Good' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 190, 191, 191, 190, 190, 191, ... 
## Resampling results across tuning parameters:
## 
##   max_depth  gamma  colsample_bytree  subsample  nrounds  ROC        Sens     
##   2          0.00   0.6               0.5         50      0.6728847  0.6255205
##   2          0.00   0.6               0.5        100      0.6687239  0.6304200
##   2          0.00   0.6               0.8         50      0.7288039  0.6641100
##   2          0.00   0.6               0.8        100      0.7116424  0.6563050
##   2          0.00   0.6               1.0         50      0.7561605  0.6733212
##   2          0.00   0.6               1.0        100      0.7316727  0.6772908
##   2          0.00   0.8               0.5         50      0.6729151  0.6205124
##   2          0.00   0.8               0.5        100      0.6624098  0.6245945
##   2          0.00   0.8               0.8         50      0.7327469  0.6672589
##   2          0.00   0.8               0.8        100      0.7079180  0.6519899
##   2          0.00   0.8               1.0         50      0.7532971  0.6749470
##   2          0.00   0.8               1.0        100      0.7320716  0.6788064
##   2          0.00   1.0               0.5         50      0.6698014  0.6229604
##   2          0.00   1.0               0.5        100      0.6593348  0.6231912
##   2          0.00   1.0               0.8         50      0.7278392  0.6629464
##   2          0.00   1.0               0.8        100      0.7079542  0.6654007
##   2          0.00   1.0               1.0         50      0.7502369  0.6764688
##   2          0.00   1.0               1.0        100      0.7289987  0.6756579
##   2          0.01   0.6               0.5         50      0.6696325  0.6348530
##   2          0.01   0.6               0.5        100      0.6660304  0.6332225
##   2          0.01   0.6               0.8         50      0.7350222  0.6693601
##   2          0.01   0.6               0.8        100      0.7153165  0.6644654
##   2          0.01   0.6               1.0         50      0.7580501  0.6747213
##   2          0.01   0.6               1.0        100      0.7379402  0.6802074
##   2          0.01   0.8               0.5         50      0.6654309  0.6214350
##   2          0.01   0.8               0.5        100      0.6561968  0.6222613
##   2          0.01   0.8               0.8         50      0.7339612  0.6737868
##   2          0.01   0.8               0.8        100      0.7155544  0.6560712
##   2          0.01   0.8               1.0         50      0.7548866  0.6721542
##   2          0.01   0.8               1.0        100      0.7308095  0.6753082
##   2          0.01   1.0               0.5         50      0.6527683  0.6149096
##   2          0.01   1.0               0.5        100      0.6568427  0.6309993
##   2          0.01   1.0               0.8         50      0.7240758  0.6685423
##   2          0.01   1.0               0.8        100      0.7079965  0.6636481
##   2          0.01   1.0               1.0         50      0.7502187  0.6768185
##   2          0.01   1.0               1.0        100      0.7292334  0.6763572
##   3          0.00   0.6               0.5         50      0.6577138  0.6098955
##   3          0.00   0.6               0.5        100      0.6579090  0.6192249
##   3          0.00   0.6               0.8         50      0.7110202  0.6627174
##   3          0.00   0.6               0.8        100      0.6925403  0.6455771
##   3          0.00   0.6               1.0         50      0.7329705  0.6760058
##   3          0.00   0.6               1.0        100      0.7119700  0.6608549
##   3          0.00   0.8               0.5         50      0.6501829  0.6171174
##   3          0.00   0.8               0.5        100      0.6461245  0.6104793
##   3          0.00   0.8               0.8         50      0.7057669  0.6446497
##   3          0.00   0.8               0.8        100      0.6866213  0.6469789
##   3          0.00   0.8               1.0         50      0.7270526  0.6676116
##   3          0.00   0.8               1.0        100      0.7052232  0.6558381
##   3          0.00   1.0               0.5         50      0.6535350  0.6103603
##   3          0.00   1.0               0.5        100      0.6445891  0.6115307
##   3          0.00   1.0               0.8         50      0.6981641  0.6418469
##   3          0.00   1.0               0.8        100      0.6829766  0.6350843
##   3          0.00   1.0               1.0         50      0.7291631  0.6697104
##   3          0.00   1.0               1.0        100      0.7071115  0.6565366
##   3          0.01   0.6               0.5         50      0.6575633  0.6202778
##   3          0.01   0.6               0.5        100      0.6453809  0.6187602
##   3          0.01   0.6               0.8         50      0.7092909  0.6593326
##   3          0.01   0.6               0.8        100      0.6908117  0.6421964
##   3          0.01   0.6               1.0         50      0.7336854  0.6659801
##   3          0.01   0.6               1.0        100      0.7100205  0.6580553
##   3          0.01   0.8               0.5         50      0.6556257  0.6185275
##   3          0.01   0.8               0.5        100      0.6452680  0.6172464
##   3          0.01   0.8               0.8         50      0.7082274  0.6484968
##   3          0.01   0.8               0.8        100      0.6888123  0.6404494
##   3          0.01   0.8               1.0         50      0.7315376  0.6728584
##   3          0.01   0.8               1.0        100      0.7105410  0.6594552
##   3          0.01   1.0               0.5         50      0.6597036  0.6172513
##   3          0.01   1.0               0.5        100      0.6513857  0.6135165
##   3          0.01   1.0               0.8         50      0.7118478  0.6603852
##   3          0.01   1.0               0.8        100      0.6903015  0.6481455
##   3          0.01   1.0               1.0         50      0.7287420  0.6681953
##   3          0.01   1.0               1.0        100      0.7071734  0.6564200
##   Spec     
##   0.6156090
##   0.6135073
##   0.6670262
##   0.6426578
##   0.7096938
##   0.6680748
##   0.6271593
##   0.6032520
##   0.6855564
##   0.6490688
##   0.7011851
##   0.6690106
##   0.6214418
##   0.6013809
##   0.6681879
##   0.6383447
##   0.6995523
##   0.6614318
##   0.6115329
##   0.5997547
##   0.6782133
##   0.6530311
##   0.7123787
##   0.6718071
##   0.6200204
##   0.6034756
##   0.6744838
##   0.6578180
##   0.7023487
##   0.6663226
##   0.6084888
##   0.5983467
##   0.6576949
##   0.6441748
##   0.6993192
##   0.6622477
##   0.6097796
##   0.6032469
##   0.6533812
##   0.6336797
##   0.6691205
##   0.6439431
##   0.6018496
##   0.5978856
##   0.6468594
##   0.6228411
##   0.6630656
##   0.6413769
##   0.6091973
##   0.5891455
##   0.6442915
##   0.6222532
##   0.6721569
##   0.6525701
##   0.6027784
##   0.5900682
##   0.6405579
##   0.6302996
##   0.6755362
##   0.6476725
##   0.6101219
##   0.5898378
##   0.6526839
##   0.6329835
##   0.6687769
##   0.6511728
##   0.6166631
##   0.6026713
##   0.6541987
##   0.6307646
##   0.6721568
##   0.6526862
## 
## Tuning parameter 'eta' was held constant at a value of 0.4
## Tuning
##  parameter 'min_child_weight' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 50, max_depth = 2, eta
##  = 0.4, gamma = 0.01, colsample_bytree = 0.6, min_child_weight = 1
##  and subsample = 1.
## 
## $svmRadial
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 1906 samples
##   25 predictor
##    2 classes: 'Bad', 'Good' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 190, 191, 191, 190, 190, 191, ... 
## Resampling results across tuning parameters:
## 
##   C   sigma       ROC        Sens       Spec     
##    1  0.00100000  0.4422421  0.6618623  0.3382454
##    1  0.08816667  0.7086939  0.6597959  0.6335690
##    1  0.17533333  0.7352082  0.6774006  0.6674958
##    1  0.26250000  0.6962665  0.7022576  0.6216990
##    1  0.34966667  0.7400808  0.6733189  0.6819551
##    1  0.43683333  0.7373153  0.6618844  0.6887111
##    1  0.52400000  0.7342219  0.6515168  0.7003736
##    3  0.00100000  0.5142799  0.3957886  0.6104006
##    3  0.08816667  0.7690665  0.6757621  0.7382671
##    3  0.17533333  0.7623650  0.6896374  0.7107533
##    3  0.26250000  0.7537839  0.6913852  0.6899916
##    3  0.34966667  0.7465813  0.6841606  0.6867337
##    3  0.43683333  0.7405400  0.6820668  0.6756531
##    3  0.52400000  0.7350261  0.6539637  0.6959443
##    5  0.00100000  0.5111336  0.4664782  0.5415850
##    5  0.08816667  0.7713958  0.6841584  0.7343014
##    5  0.17533333  0.7573858  0.6927878  0.7027055
##    5  0.26250000  0.7484558  0.6905698  0.6826452
##    5  0.34966667  0.7422423  0.6734292  0.6884862
##    5  0.43683333  0.7366419  0.6821707  0.6678409
##    5  0.52400000  0.7323630  0.6729630  0.6736692
##   20  0.00100000  0.6034719  0.5483381  0.5842606
##   20  0.08816667  0.7524496  0.6948853  0.6916302
##   20  0.17533333  0.7440833  0.6894071  0.6758915
##   20  0.26250000  0.7390002  0.6813623  0.6771745
##   20  0.34966667  0.7343741  0.6814779  0.6679639
##   20  0.43683333  0.7307089  0.6771677  0.6664536
##   20  0.52400000  0.7266876  0.6686620  0.6708850
## 
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.08816667 and C = 5.
## 
## $ranger
## Random Forest 
## 
## 1906 samples
##   25 predictor
##    2 classes: 'Bad', 'Good' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 190, 191, 191, 190, 190, 191, ... 
## Resampling results across tuning parameters:
## 
##   splitrule   mtry  min.node.size  ROC        Sens       Spec     
##   extratrees  10    1              0.8165927  0.7037461  0.7881626
##   extratrees  10    5              0.8168939  0.7000141  0.7932919
##   extratrees  20    1              0.8196030  0.7035087  0.7931762
##   extratrees  20    5              0.8191942  0.6941828  0.8070530
##   extratrees  25    1              0.8193723  0.6998969  0.7991231
##   extratrees  25    5              0.8179915  0.6942990  0.8054191
##   gini        10    1              0.7892622  0.6964107  0.7264911
##   gini        10    5              0.7920299  0.6989754  0.7320894
##   gini        20    1              0.7787087  0.6824193  0.7220609
##   gini        20    5              0.7778494  0.6806713  0.7221780
##   gini        25    1              0.7759767  0.6851025  0.7186807
##   gini        25    5              0.7763475  0.6811368  0.7198466
## 
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 20, splitrule = extratrees
##  and min.node.size = 1.
## 
## attr(,"class")
## [1] "caretList"

Model Performance

ROC CI

# Collect the cross-validated resample results from all three models and
# compare their ROC distributions side by side.
resamples_models <- resamples(model_list)

dotplot(resamples_models, metric = "ROC")

Model Statistics

# For each model, sweep the probability threshold from 0 to 1, plot F1 and
# sensitivity against the cutoff, and keep the per-model statistics table.
nms_models <- names(model_list)
resamples_stat_list <- vector("list", length(nms_models))  # preallocate
for (j in seq_along(nms_models)) {
  model1 <- model_list[[j]]
  # thresholder() computes confusion-matrix statistics at each cutoff from
  # the saved resample predictions of the final model.
  resample_stata <- thresholder(model1,
                                threshold = seq(0, 1, by = 0.01),
                                final = TRUE,
                                statistics = "all") %>% setDT()
  p <- ggplot(resample_stata, aes(x = prob_threshold, y = F1, col = "F1")) +
    geom_point() +
    geom_point(aes(y = Sensitivity, col = "Sensitivity")) +
    scale_x_continuous(breaks = seq(0, 1, by = .1)) +
    ggtitle(nms_models[j])
  print(p)
  resample_stata[, model := nms_models[j]]
  resamples_stat_list[[j]] <- resample_stata
}

ROC CURVE

# Interactive ROC curves from the threshold sweep: Recall is sensitivity and
# 1 - Specificity is the false positive rate.
resamples_combined <- rbindlist(resamples_stat_list, fill = TRUE)
library(plotly)
roc_plot <- ggplot(resamples_combined,
                   aes(x = 1 - Specificity, y = Recall, color = model)) +
  geom_line(size = 1) +
  scale_x_continuous(breaks = seq(0, 1, by = .1)) +
  ggtitle(paste0("ROC for models")) +
  scale_color_viridis_d()
ggplotly(roc_plot)

Precision Recall Curve

# Interactive precision–recall curves for the three models.
pr_plot <- ggplot(resamples_combined,
                  aes(x = Recall, y = Precision, color = model)) +
  geom_line(size = 1) +
  scale_x_continuous(breaks = seq(0, 1, by = .1)) +
  scale_color_viridis_d() +
  ggtitle(paste0("Precision recall curve"))
ggplotly(pr_plot)

Variable Importance

IML models

library(iml)
# Predictor matrix: identifiers and the target are excluded.
X_pred <- train_sampled[, .SD, .SDcols = !c("customerid", "good_bad_flag")] %>%
  as.data.frame()

nms_models <- names(model_list)

# Probability of the first class column ("Bad", given the factor levels set
# earlier) from a caret model. Defined once, outside the loop, since it does
# not depend on the loop variable.
pred <- function(model, newdata) {
  results <- predict(model, newdata = newdata, type = "prob")
  results[[1L]]
}

# Wrap every fitted model in an iml Predictor over the same data.
iml_models <- vector("list", length(nms_models))
for (i in seq_along(nms_models)) {
  iml_models[[i]] <- Predictor$new(model = model_list[[i]],
                                   data = X_pred,
                                   predict.function = pred,
                                   y = train_sampled$good_bad_flag)
}

Feature Importance plots

# Permutation feature importance (classification-error loss) per model, with
# the 5%/95% bounds reported in impa$results.
plots <- vector("list", length(nms_models))
for (i in seq_along(nms_models)) {
  model_this <- iml_models[[i]]
  impa <- FeatureImp$new(model_this, loss = "ce")
  var_importanta <- impa$results %>% data.table()

  #write.csv(var_importanta, file = "var_importanta.csv", row.names = F)
  setorder(var_importanta, -importance)
  # Fix: [1:20] pads NA rows when fewer than 20 features exist; head() does not.
  var10a <- head(var_importanta, 20)
  if (i == 2) write.csv(var10a, file = "svm_var.csv", row.names = FALSE)
  plots[[i]] <- ggplot(var10a, aes(reorder(feature, importance), importance)) +
    geom_point() +
    ggtitle(nms_models[i]) +
    # Fix: `width` is not a geom_linerange parameter (it was silently
    # ignored with a warning), so it is dropped; the dodge position stays.
    geom_linerange(aes(ymin = importance.05, ymax = importance.95),
                   position = position_dodge(width = .7)) +
    coord_flip()
}

plots
## [[1]]

## 
## [[2]]

## 
## [[3]]

Shap Values calculation

# Shapley values for every training row under the ranger model, class "Bad".
nms <- names(model_list)
ids <- which(nms == "ranger")
shap_list <- vector("list", nrow(X_pred))  # preallocated: one element per row
# NOTE(review): model_list_shap is never used in the visible code — confirm
# before removing.
model_list_shap <- list()
model_this <- iml_models[[ids]]

tic()

# This loop is very slow (~5 h on the original run); results are cached to
# disk below so it does not have to be repeated.
for (i in seq_len(nrow(X_pred))) {
  shap <- Shapley$new(model_this, x.interest = X_pred[i, ], sample.size = 30)
  shap_import <- shap$results %>% data.table()
  shap_import <- shap_import[class == "Bad"]
  # Tag each row's contributions with its customerid for later joins.
  shap_list[[i]] <- shap_import[, customerid := train_sampled[i, customerid]]
}
toc()
## 17730.95 sec elapsed
shap_values <- rbindlist(shap_list, fill = TRUE)

write.csv(shap_values, file = "shap_values.csv", row.names = FALSE)

Shap Values plot

library(ggforce)
# Reload the cached Shapley values so the plot does not require rerunning the
# long loop above.
shap_values <- fread("shap_values.csv")

# Rank features by mean absolute contribution and keep the top 20.
shap_values[, phi2 := abs(phi)]
shap_imp <- shap_values[, .(Med = median(phi2),
                            Mean = mean(phi2)), by = feature]
setorder(shap_imp, -Mean)
shap_imp <- shap_imp[1:20, ]

shap_values <- shap_values[feature %in% shap_imp$feature]

# Order the axis by importance (most important feature on top after the flip).
shap_values[, feature := factor(feature, levels = rev(shap_imp$feature))]

# Sina (beeswarm-style) plot of per-row contributions, coloured by phi.var.
ggplot(shap_values, aes(feature, phi, color = phi.var)) +
  geom_sina() +
  geom_hline(yintercept = 0) +
  scale_color_gradient(low = "#2187E3", high = "#F32858",
                       breaks = c(0, 1), labels = c("Low", "High")) +
  theme_bw() +
  theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank(),  # remove y axis line/ticks
        legend.position = "bottom") +
  coord_flip()